#
# CleanAndSubsetData.R.
#
# Clean and subset Glassdoor rating data set
# for subsequent plots and analysis.
#
# Seth Hill, sjhill@ucsd.edu
#

#rm(list=ls())
# setwd("SET WD HERE")

# Libraries
library(bit64)
library(data.table)

# Location of the all_reviews.csv data file.
all_reviews_path = file.path(".","all_reviews.csv")

#
## Data from https://www.kaggle.com/datasets/davidgauthier/glassdoor-job-reviews-2
#
nrow = -1#5e05 # for debugging
DT = fread(all_reviews_path,nrow=nrow)

#
# Remove duplicates.
#
cat("\nRemoving duplicates, beginning with",DT[,.N],"observations.\n")
DT[!duplicated(DT)]
cat(" Ending with",DT[,.N],"observations.\n")

#
# Identify mention of bureaucracy in any of three
# open text fields.
#
DT[,title := tolower(title)] # lower case for string matching
# Presence of bureau string.
DT[,bureau_title := 1*(grepl(".*ureaucr.*",title))]
# Count of bureau string.
DT[,bureau_title_count := lengths(gregexpr("bureau\\w*", title, ignore.case = TRUE))]
# Hack to set to zero b/c gregexpr returns -1 when no match.
DT[bureau_title == 0, bureau_title_count := 0]
print(DT[,table(bureau_title_count)])

# In pro and con lists.
DT[,bureau_pros := 1*(grepl(".*ureaucr.*",pros))]
DT[,bureau_cons := 1*(grepl(".*ureaucr.*",cons))]
DT[,bureau_any := as.numeric(rowSums(.SD) > 0), .SDcols = c("bureau_title", "bureau_pros", "bureau_cons")]

# Comparison strings to benchmark magnitude of bureaucracy effect.
DT[,lowpay_title := 1*(grepl(".*low pay.*",title))]
DT[,longhours_title := 1*(grepl(".*long hours.*",title))]
DT[,conflict_title := 1*(grepl(".*conflict.*",title))]
DT[,stress_title := 1*(grepl(".*stress.*",title))]

# Recode comp benefits to numeric.
setnames(DT,"Compensation and Benefits","CompAndBenefits")
DT = merge(DT,data.table(CompAndBenefits=c("1", "1.0", "2", "2.0", "3", "3.0", "4", "4.0", "5", "5.0"),comp_benefits=c(1,1,2,2,3,3,4,4,5,5)),by="CompAndBenefits")
DT[,table(CompAndBenefits,comp_benefits)]
DT[,firm_name := sub(".*Reviews/([^-]+(?:-[^-]+)*)-Reviews-.*", "\\1", firm_link)]


#################################
# Random sample of reviews for Table S2.
#################################
cat("Random sample of text from reviews mentioning string `bureau':\n")
set.seed(202503011)
zed = DT[bureau_title == 1, .SD[sample(.N, min(10, .N))], .SDcols = c("firm_name", "title")]
fwrite(zed, "Table_S2.tex", sep = "&", col.names = FALSE, quote = FALSE)

#
# Recodes.
#
DT[,overall := as.numeric(rating)]
# Extract firm names, tabulate # reviews, and create firm fixed effects.
DT[,firm_n_reviews := .N,by="firm_name"]
# Firm FEs for any firm with at least 50 reviews.
DT[,firm_name2 := ifelse(firm_n_reviews < 50,"POOLED SMALL FIRMS",firm_name)]
DT[,firm_id := .GRP,by="firm_name2"]
cat("Total firms in analysis:\n")
DT[,print(max(firm_id))]


# Firm average bureaucracy and benefit evaluations.
DT[,firm_bureau_title := mean(bureau_title,na.rm=T),by="firm_id"]
DT[,firm_bureau_pros := mean(bureau_pros,na.rm=T),by="firm_id"]
DT[,firm_bureau_cons := mean(bureau_cons,na.rm=T),by="firm_id"]
DT[,firm_bureau_any := mean(bureau_any,na.rm=T),by="firm_id"]
DT[,firm_comp_benefits := mean(comp_benefits,na.rm=T),by="firm_id"]
# Employee status.
DT[,relationship_type := ifelse(grepl("Current Employee",status),"Current Employee", ifelse(grepl("Former Employee",status),"Former Employee", "Other"))]

# Drop string variables for memory reasons.
DT = DT[,-c("CompAndBenefits","title","pros","cons","firm_link","firm_name","status")]
DT[, date2 := as.IDate(date, format = "%b %d, %Y")]
DT[, year  := as.numeric(format(date2,"%Y"))]


# Aggregate bureau mentions to unique date2-firm_id. Exclude small firms.
DT2 = DT[firm_name2 != "POOLED SMALL FIRMS",lapply(.SD,sum),by=c("date2","firm_id"),.SDcols=c("bureau_title", "bureau_pros", "bureau_cons", "bureau_any", "overall")]
# Join count by date.
DT2 = merge(DT2,DT[firm_name2 != "POOLED SMALL FIRMS",.N,by=c("date2","firm_id")],by=c("date2","firm_id"),all=T)
check = DT2[,sum(bureau_any,na.rm=T)]

# Compute forward averages for following columns.
the_cols = c("N","bureau_title", "bureau_pros", "bureau_cons", "bureau_any", "overall")
  ###########
  cat("\n Rolling 365-day forward average bureaucracy mentions by firm for each
   respondent's interview date.\n");flush.console()
  ###########

  # rbind pseudo-observations for each date2-firm_id with 0s for all counts.
  # Create a sequence of all dates from min to max in the data
  all_dates <- data.table(date2 = seq.Date(from = min(DT2$date2),to = max(DT2$date2),by = "day"))
  # Get unique firms
  firms <- unique(DT2$firm_id)
  # Create complete cross join of dates and firms
  complete_dt <- all_dates[, .(date2 = date2,firm_id = rep(firms, each = .N))]
  # rbind.
  DT2 = rbind(DT2,complete_dt,use.names=T,fill=T)
  rm(all_dates,firms,complete_dt);gc()
  # Sum to the date2-firm_id to get back to one observation for every date2-firm_id.
  DT2 = DT2[,lapply(.SD,sum,na.rm=T),by=c("firm_id","date2"),.SDcols=the_cols]
  # Check summing.
  if (check != DT2[,sum(bureau_any,na.rm=T)]) {
    stop("something not right")
  }

  # Frollsum .SDcols from date2+1 through date2+366.
  setkey(DT2,firm_id,date2)
  new_names = paste0("forward_sum_", the_cols)
  DT2[,(new_names) := lapply(.SD, function(x) frollsum(x, n=366, align="left")), by="firm_id", .SDcols=the_cols]
  
# Subtract today's values so that forward values exclude current respondents.
DT2[, `:=`(forward_sum_N = forward_sum_N - N, forward_sum_bureau_title = forward_sum_bureau_title - bureau_title, forward_sum_bureau_pros = forward_sum_bureau_pros - bureau_pros, forward_sum_bureau_cons = forward_sum_bureau_cons - bureau_cons, forward_sum_bureau_any = forward_sum_bureau_any - bureau_any, forward_sum_overall = forward_sum_overall - overall)]
# Calculate means.
DT2[, `:=`(
  forward_avg_title = forward_sum_bureau_title / forward_sum_N,
  forward_avg_pros = forward_sum_bureau_pros / forward_sum_N,
  forward_avg_cons = forward_sum_bureau_cons / forward_sum_N,
  forward_avg_any = forward_sum_bureau_any / forward_sum_N,
  forward_avg_overall = forward_sum_overall / forward_sum_N
)]
# Code year.
DT2[, year  := as.numeric(format(date2,"%Y"))]

# Merge forward averages to DT.
setkey(DT,firm_id,date2)
DT = merge(DT,DT2[,c("date2","firm_id","forward_sum_N","forward_avg_title", "forward_avg_pros", "forward_avg_cons", "forward_avg_any")],by=c("firm_id","date2"),all.x=T)
if (nrow == -1) {
  # Write out clean csv for Stata analysis.
  fwrite(DT[,c("firm_id", "date2", "bureau_title", "bureau_title_count", "bureau_pros", "bureau_cons", "bureau_any", "lowpay_title", "longhours_title", "conflict_title", "stress_title", "comp_benefits", "overall", "firm_n_reviews", "firm_name2", "firm_bureau_title", "firm_bureau_pros", "firm_bureau_cons", "firm_bureau_any", "firm_comp_benefits", "forward_sum_N", "forward_avg_title", "forward_avg_pros", "forward_avg_cons", "forward_avg_any")], file="all_reviews_cleaned.csv")
}

# Sum across firms to the date.
DT3 = DT2[,lapply(.SD,sum,na.rm=T),by="date2",.SDcols=c("forward_sum_N", "forward_sum_bureau_title", "forward_sum_overall")]
# Calculate means.
DT3[, `:=`(
  forward_avg_title = forward_sum_bureau_title / forward_sum_N,
  forward_avg_overall = forward_sum_overall / forward_sum_N
)]

# Flip overall into dissatisfaction.
DT3[,dissat := 5 - forward_avg_overall]

if (nrow == -1) {
  # Write out DT3 for plotting in R.
  fwrite(DT3,file="reviews_by_date.csv")
}
